//
// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0
//

// Suppress the overlength-string diagnostic: the inline assembly below is one very long string literal.
#pragma GCC diagnostic ignored "-Woverlength-strings"

#if !defined(__aarch64__) || !defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
#error This file must be compiled for AArch64, FEAT_BF16.
#else  // Architectural features check.

#include "kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.h"

#include <stddef.h>
#include <stdint.h>

#include "kai/kai_common.h"

// Packing parameters reported by the kai_get_{mr,nr,kr,sr}_* getters below.
// kai_kr is additionally the multiple that K is rounded up to by the
// LHS/RHS packed-offset helpers in this file.
// NOTE(review): presumably these mirror the "p1x4" / "p12x4" parts of the
// kernel name (LHS block of 1 row, RHS block of 12 columns, K block of 4) -
// confirm against the packing routines.
static const size_t kai_mr = 1;
static const size_t kai_nr = 12;
static const size_t kai_kr = 4;
static const size_t kai_sr = 1;

// Step sizes reported by the kai_get_{m,n}_step_* getters below. The offset
// helpers in this file require their m_idx / n_idx arguments to be multiples
// of these values.
static const size_t kai_m_step = 1;
static const size_t kai_n_step = 36;

size_t kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    return kai_m_step;
}

size_t kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    return kai_n_step;
}

// Reports the mr parameter of this micro-kernel (the file-scope kai_mr value).
// NOTE(review): presumably the LHS packing block height expected by the
// kernel - confirm against the LHS packing routine.
size_t kai_get_mr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    const size_t mr = kai_mr;
    return mr;
}

// Reports the nr parameter of this micro-kernel (the file-scope kai_nr value).
// NOTE(review): presumably the number of RHS columns per packed block -
// confirm against the RHS packing routine.
size_t kai_get_nr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    const size_t nr = kai_nr;
    return nr;
}

// Reports the kr parameter of this micro-kernel (the file-scope kai_kr value).
// The packed-offset helpers in this file round K up to a multiple of it.
size_t kai_get_kr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    const size_t kr = kai_kr;
    return kr;
}

// Reports the sr parameter of this micro-kernel (the file-scope kai_sr value).
size_t kai_get_sr_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(void) {
    const size_t sr = kai_sr;
    return sr;
}

// Computes the byte offset of row m_idx within the packed LHS buffer.
//
// m_idx: row index; only 0 is accepted (checked with KAI_ASSUME), so the
//        result is 0 whenever the assumption holds.
// k:     K dimension; rounded up to a multiple of kai_kr to get the row length.
//
// Each packed element is sizeof(uint16_t) bytes wide (presumably one bf16
// value - confirm against the LHS packing routine).
size_t kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m_idx, size_t k) {
    KAI_ASSUME(m_idx == 0);

    const size_t k_padded = kai_roundup(k, kai_kr);
    const size_t row_bytes = k_padded * sizeof(uint16_t);
    return m_idx * row_bytes;
}

// Computes the byte offset of column n_idx within the packed RHS buffer.
//
// n_idx: column index; must be a multiple of the kernel's N step (checked
//        with KAI_ASSUME).
// k:     K dimension; rounded up to a multiple of kai_kr.
//
// Every column accounts for one float plus k (rounded up) 16-bit values.
// NOTE(review): the float is presumably the per-column bias stored ahead of
// the bf16 data - confirm against the RHS packing routine.
size_t kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t n_idx, size_t k) {
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);

    const size_t k_padded = kai_roundup(k, kai_kr);
    const size_t data_bytes = k_padded * sizeof(uint16_t);
    const size_t bytes_per_column = data_bytes + sizeof(float);
    return n_idx * bytes_per_column;
}

// Computes the byte offset of element (m_idx, n_idx) within the destination
// matrix of float values.
//
// m_idx:      row index; must be a multiple of the kernel's M step.
// n_idx:      column index; must be a multiple of the kernel's N step.
// dst_stride: distance between consecutive rows; it is added to a byte count,
//             so it is treated as a size in bytes.
//
// Both index requirements are checked with KAI_ASSUME.
size_t kai_get_dst_offset_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(
    size_t m_idx, size_t n_idx, size_t dst_stride) {
    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);
    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot() == 0);

    const size_t row_offset = m_idx * dst_stride;
    const size_t col_offset = n_idx * sizeof(float);
    return row_offset + col_offset;
}

// Computes the size in bytes of a densely stored m x n matrix of float values.
//
// m: number of rows.
// n: number of columns.
size_t kai_get_dst_size_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(size_t m, size_t n) {
    const size_t num_elements = m * n;
    return num_elements * sizeof(float);
}

void kai_run_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot(
    size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
    size_t dst_stride_col, float clamp_min, float clamp_max) {
    KAI_UNUSED(dst_stride_row);
    KAI_UNUSED(dst_stride_col);

    KAI_ASSUME(m == 1);

    typedef struct {
        float maxval;
        float minval;
    } KernelArgs;

    KernelArgs ka;
    ka.maxval = clamp_max;
    ka.minval = clamp_min;

    size_t N = n;
    size_t K = k;

    const void* A_ptr = lhs_packed;
    const void* B_ptr = rhs_packed;
    void* output_ptr = dst;

    uint64_t flags = 0;

    __asm__ __volatile__(
        "add x26, %x[K], #0x3\n"
        "mov x20, #0xc\n"
        "bic x26, x26, #0x3\n"
        "add x25, %x[N], #0x3\n"
        "lsr x25, x25, #0x2\n"
        "lsl x26, x26, #0x1\n"
        "add x26, x26, #0x4\n"
        "mul x26, x26, x20\n"
        "1:"  // Column loop
        "cmp x25, #0x9\n"
        "bge 89f\n"
        "cmp x25, #0x7\n"
        "bgt 78f\n"
        "beq 67f\n"
        "cmp x25, #0x5\n"
        "bgt 56f\n"
        "beq 45f\n"
        "cmp x25, #0x3\n"
        "bgt 34f\n"
        "beq 23f\n"
        "cmp x25, #0x1\n"
        "bgt 12f\n"
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "mov x24, %x[K]\n"
        "movi v16.16b, #0x0\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v15.4s, v14.4s, v16.4s\n"
        "zip1 v14.4s, v14.4s, v16.4s\n"
        "blt 4f\n"
        "cmp x24, #0x8\n"
        "blt 3f\n"
        "2:"  // Width 1: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "cmp x24, #0x8\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        "bge 2b\n"
        "3:"  // Width 1: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q3, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q4, [%x[B_ptr], #0x10]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc6e  // bfdot v14.4s, v3.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        ".inst 0x6e40fc8f  // bfdot v15.4s, v4.8h, v0.8h\n"
        "4:"  // Width 1: Multiply loop: Main loop skip
        "cbz x24, 7f\n"
        "tbz x24, #1, 5f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 6f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 6f\n"
        "5:"  // Width 1: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "6:"  // Width 1: Multiply loop: Ragged operand read: Done
        "ldr q5, [%x[B_ptr], #0x0]\n"
        "ldr q6, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        ".inst 0x6e40fcae  // bfdot v14.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fccf  // bfdot v15.4s, v6.8h, v0.8h\n"
        "7:"  // Width 1: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "tbz %x[flags], #1, 8f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.4s }, [x21]\n"
        "ld1r { v16.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v17.4s\n"
        "fmax v14.4s, v14.4s, v16.4s\n"
        "8:"  // Width 1: No activation
        "cmp %x[N], #0x4\n"
        "blt 9f\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 11f\n"
        "9:"  // Width 1: Partial writeback
        "tbz %x[N], #1, 10f\n"
        "str d14, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 11f\n"
        "st1 { v14.s }[2], [%x[output_ptr]]\n"
        "b 11f\n"
        "10:"  // Width 1: Partial direct writeback: partial_1_0
        "str s14, [%x[output_ptr], #0x0]\n"
        "11:"  // Width 1: Writeback done
        "b 100f\n"
        "12:"  // Width 2
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "movi v18.16b, #0x0\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v17.4s, v15.4s, v18.4s\n"
        "zip1 v16.4s, v15.4s, v18.4s\n"
        "zip2 v15.4s, v14.4s, v18.4s\n"
        "zip1 v14.4s, v14.4s, v18.4s\n"
        "blt 15f\n"
        "cmp x24, #0x8\n"
        "blt 14f\n"
        "13:"  // Width 2: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        "bge 13b\n"
        "14:"  // Width 2: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q5, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q6, [%x[B_ptr], #0x10]\n"
        "ldr q7, [%x[B_ptr], #0x20]\n"
        "ldr q8, [%x[B_ptr], #0x30]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fcae  // bfdot v14.4s, v5.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fccf  // bfdot v15.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf0  // bfdot v16.4s, v7.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        ".inst 0x6e40fd11  // bfdot v17.4s, v8.8h, v0.8h\n"
        "15:"  // Width 2: Multiply loop: Main loop skip
        "cbz x24, 18f\n"
        "tbz x24, #1, 16f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 17f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 17f\n"
        "16:"  // Width 2: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "17:"  // Width 2: Multiply loop: Ragged operand read: Done
        "ldr q9, [%x[B_ptr], #0x0]\n"
        "ldr q10, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q11, [%x[B_ptr], #0x20]\n"
        "ldr q12, [%x[B_ptr], #0x30]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        ".inst 0x6e40fd2e  // bfdot v14.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd4f  // bfdot v15.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd70  // bfdot v16.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd91  // bfdot v17.4s, v12.8h, v0.8h\n"
        "18:"  // Width 2: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "tbz %x[flags], #1, 19f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v17.4s }, [x21]\n"
        "ld1r { v16.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v17.4s\n"
        "fmin v15.4s, v15.4s, v17.4s\n"
        "fmax v14.4s, v14.4s, v16.4s\n"
        "fmax v15.4s, v15.4s, v16.4s\n"
        "19:"  // Width 2: No activation
        "cmp %x[N], #0x8\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "blt 20f\n"
        "str q15, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 22f\n"
        "20:"  // Width 2: Partial writeback
        "tbz %x[N], #1, 21f\n"
        "str d15, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 22f\n"
        "st1 { v15.s }[2], [%x[output_ptr]]\n"
        "b 22f\n"
        "21:"  // Width 2: Partial direct writeback: partial_1_4
        "tbz %x[N], #0, 22f\n"
        "str s15, [%x[output_ptr], #0x0]\n"
        "22:"  // Width 2: Writeback done
        "b 100f\n"
        "23:"  // Width 3
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "movi v20.16b, #0x0\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v17.4s, v15.4s, v20.4s\n"
        "zip2 v19.4s, v16.4s, v20.4s\n"
        "zip1 v18.4s, v16.4s, v20.4s\n"
        "zip1 v16.4s, v15.4s, v20.4s\n"
        "zip2 v15.4s, v14.4s, v20.4s\n"
        "zip1 v14.4s, v14.4s, v20.4s\n"
        "blt 26f\n"
        "cmp x24, #0x8\n"
        "blt 25f\n"
        "24:"  // Width 3: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 24b\n"
        "25:"  // Width 3: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q7, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q8, [%x[B_ptr], #0x10]\n"
        "ldr q9, [%x[B_ptr], #0x20]\n"
        "ldr q10, [%x[B_ptr], #0x30]\n"
        "ldr q11, [%x[B_ptr], #0x40]\n"
        "ldr q12, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fcee  // bfdot v14.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd0f  // bfdot v15.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd30  // bfdot v16.4s, v9.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fd51  // bfdot v17.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd72  // bfdot v18.4s, v11.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fd93  // bfdot v19.4s, v12.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "26:"  // Width 3: Multiply loop: Main loop skip
        "cbz x24, 29f\n"
        "tbz x24, #1, 27f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 28f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 28f\n"
        "27:"  // Width 3: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "28:"  // Width 3: Multiply loop: Ragged operand read: Done
        "ldr q13, [%x[B_ptr], #0x0]\n"
        "ldr q1, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q2, [%x[B_ptr], #0x20]\n"
        "ldr q3, [%x[B_ptr], #0x30]\n"
        "ldr q4, [%x[B_ptr], #0x40]\n"
        "ldr q5, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        ".inst 0x6e40fdae  // bfdot v14.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc2f  // bfdot v15.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc50  // bfdot v16.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc71  // bfdot v17.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc92  // bfdot v18.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb3  // bfdot v19.4s, v5.8h, v0.8h\n"
        "29:"  // Width 3: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "tbz %x[flags], #1, 30f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v18.4s }, [x21]\n"
        "ld1r { v17.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v18.4s\n"
        "fmin v15.4s, v15.4s, v18.4s\n"
        "fmin v16.4s, v16.4s, v18.4s\n"
        "fmax v14.4s, v14.4s, v17.4s\n"
        "fmax v15.4s, v15.4s, v17.4s\n"
        "fmax v16.4s, v16.4s, v17.4s\n"
        "30:"  // Width 3: No activation
        "cmp %x[N], #0xc\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "add %x[output_ptr], %x[output_ptr], #0x20\n"
        "blt 31f\n"
        "str q16, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 33f\n"
        "31:"  // Width 3: Partial writeback
        "tbz %x[N], #1, 32f\n"
        "str d16, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 33f\n"
        "st1 { v16.s }[2], [%x[output_ptr]]\n"
        "b 33f\n"
        "32:"  // Width 3: Partial direct writeback: partial_1_8
        "tbz %x[N], #0, 33f\n"
        "str s16, [%x[output_ptr], #0x0]\n"
        "33:"  // Width 3: Writeback done
        "b 100f\n"
        "34:"  // Width 4
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x20, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x20, #0x0]\n"
        "movi v22.16b, #0x0\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "add x20, x20, #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v21.4s, v17.4s, v22.4s\n"
        "zip1 v20.4s, v17.4s, v22.4s\n"
        "zip2 v19.4s, v16.4s, v22.4s\n"
        "zip1 v18.4s, v16.4s, v22.4s\n"
        "zip2 v17.4s, v15.4s, v22.4s\n"
        "zip1 v16.4s, v15.4s, v22.4s\n"
        "zip2 v15.4s, v14.4s, v22.4s\n"
        "zip1 v14.4s, v14.4s, v22.4s\n"
        "blt 37f\n"
        "cmp x24, #0x8\n"
        "blt 36f\n"
        "35:"  // Width 4: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "ldr q8, [x20, #0x10]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 35b\n"
        "36:"  // Width 4: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q9, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q10, [%x[B_ptr], #0x10]\n"
        "ldr q11, [%x[B_ptr], #0x20]\n"
        "ldr q12, [%x[B_ptr], #0x30]\n"
        "ldr q13, [%x[B_ptr], #0x40]\n"
        "ldr q1, [%x[B_ptr], #0x50]\n"
        "ldr q2, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fd2e  // bfdot v14.4s, v9.8h, v0.8h\n"
        "ldr q3, [x20, #0x10]\n"
        ".inst 0x6e40fd4f  // bfdot v15.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd70  // bfdot v16.4s, v11.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fd91  // bfdot v17.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdb2  // bfdot v18.4s, v13.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fc33  // bfdot v19.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc54  // bfdot v20.4s, v2.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fc75  // bfdot v21.4s, v3.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "37:"  // Width 4: Multiply loop: Main loop skip
        "cbz x24, 40f\n"
        "tbz x24, #1, 38f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 39f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 39f\n"
        "38:"  // Width 4: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "39:"  // Width 4: Multiply loop: Ragged operand read: Done
        "ldr q4, [%x[B_ptr], #0x0]\n"
        "ldr q5, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q6, [%x[B_ptr], #0x20]\n"
        "ldr q7, [%x[B_ptr], #0x30]\n"
        "ldr q8, [%x[B_ptr], #0x40]\n"
        "ldr q9, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q10, [x20, #0x0]\n"
        "ldr q11, [x20, #0x10]\n"
        ".inst 0x6e40fc8e  // bfdot v14.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcaf  // bfdot v15.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd0  // bfdot v16.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf1  // bfdot v17.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd12  // bfdot v18.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd33  // bfdot v19.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd54  // bfdot v20.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd75  // bfdot v21.4s, v11.8h, v0.8h\n"
        "40:"  // Width 4: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "tbz %x[flags], #1, 41f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v19.4s }, [x21]\n"
        "ld1r { v18.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v19.4s\n"
        "fmin v15.4s, v15.4s, v19.4s\n"
        "fmin v16.4s, v16.4s, v19.4s\n"
        "fmin v17.4s, v17.4s, v19.4s\n"
        "fmax v14.4s, v14.4s, v18.4s\n"
        "fmax v15.4s, v15.4s, v18.4s\n"
        "fmax v16.4s, v16.4s, v18.4s\n"
        "fmax v17.4s, v17.4s, v18.4s\n"
        "41:"  // Width 4: No activation
        "cmp %x[N], #0x10\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "add %x[output_ptr], %x[output_ptr], #0x30\n"
        "blt 42f\n"
        "str q17, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 44f\n"
        "42:"  // Width 4: Partial writeback
        "tbz %x[N], #1, 43f\n"
        "str d17, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 44f\n"
        "st1 { v17.s }[2], [%x[output_ptr]]\n"
        "b 44f\n"
        "43:"  // Width 4: Partial direct writeback: partial_1_12
        "tbz %x[N], #0, 44f\n"
        "str s17, [%x[output_ptr], #0x0]\n"
        "44:"  // Width 4: Writeback done
        "b 100f\n"
        "45:"  // Width 5
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x20, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x20, #0x0]\n"
        "movi v24.16b, #0x0\n"
        "mov x23, %x[A_ptr]\n"
        "ldr q18, [x20, #0x10]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "add x20, x20, #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v21.4s, v17.4s, v24.4s\n"
        "zip1 v20.4s, v17.4s, v24.4s\n"
        "zip2 v19.4s, v16.4s, v24.4s\n"
        "zip2 v17.4s, v15.4s, v24.4s\n"
        "zip2 v23.4s, v18.4s, v24.4s\n"
        "zip1 v22.4s, v18.4s, v24.4s\n"
        "zip1 v18.4s, v16.4s, v24.4s\n"
        "zip1 v16.4s, v15.4s, v24.4s\n"
        "zip2 v15.4s, v14.4s, v24.4s\n"
        "zip1 v14.4s, v14.4s, v24.4s\n"
        "blt 48f\n"
        "cmp x24, #0x8\n"
        "blt 47f\n"
        "46:"  // Width 5: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "ldr q8, [x20, #0x10]\n"
        "ldr q9, [x20, #0x20]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "ldr q10, [x20, #0x30]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd36  // bfdot v22.4s, v9.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fd57  // bfdot v23.4s, v10.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 46b\n"
        "47:"  // Width 5: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q11, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q12, [%x[B_ptr], #0x10]\n"
        "ldr q13, [%x[B_ptr], #0x20]\n"
        "ldr q1, [%x[B_ptr], #0x30]\n"
        "ldr q2, [%x[B_ptr], #0x40]\n"
        "ldr q3, [%x[B_ptr], #0x50]\n"
        "ldr q4, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fd6e  // bfdot v14.4s, v11.8h, v0.8h\n"
        "ldr q5, [x20, #0x10]\n"
        "ldr q6, [x20, #0x20]\n"
        ".inst 0x6e40fd8f  // bfdot v15.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdb0  // bfdot v16.4s, v13.8h, v0.8h\n"
        "ldr q7, [x20, #0x30]\n"
        ".inst 0x6e40fc31  // bfdot v17.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc52  // bfdot v18.4s, v2.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fc73  // bfdot v19.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc94  // bfdot v20.4s, v4.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fcb5  // bfdot v21.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd6  // bfdot v22.4s, v6.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fcf7  // bfdot v23.4s, v7.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "48:"  // Width 5: Multiply loop: Main loop skip
        "cbz x24, 51f\n"
        "tbz x24, #1, 49f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 50f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 50f\n"
        "49:"  // Width 5: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "50:"  // Width 5: Multiply loop: Ragged operand read: Done
        "ldr q8, [%x[B_ptr], #0x0]\n"
        "ldr q9, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q10, [%x[B_ptr], #0x20]\n"
        "ldr q11, [%x[B_ptr], #0x30]\n"
        "ldr q12, [%x[B_ptr], #0x40]\n"
        "ldr q13, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q1, [x20, #0x0]\n"
        "ldr q2, [x20, #0x10]\n"
        ".inst 0x6e40fd0e  // bfdot v14.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd2f  // bfdot v15.4s, v9.8h, v0.8h\n"
        "ldr q3, [x20, #0x20]\n"
        "ldr q4, [x20, #0x30]\n"
        ".inst 0x6e40fd50  // bfdot v16.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd71  // bfdot v17.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd92  // bfdot v18.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdb3  // bfdot v19.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc34  // bfdot v20.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc55  // bfdot v21.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc76  // bfdot v22.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc97  // bfdot v23.4s, v4.8h, v0.8h\n"
        "51:"  // Width 5: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "faddp v18.4s, v22.4s, v23.4s\n"
        "tbz %x[flags], #1, 52f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v20.4s }, [x21]\n"
        "ld1r { v19.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v20.4s\n"
        "fmin v15.4s, v15.4s, v20.4s\n"
        "fmin v16.4s, v16.4s, v20.4s\n"
        "fmin v17.4s, v17.4s, v20.4s\n"
        "fmin v18.4s, v18.4s, v20.4s\n"
        "fmax v14.4s, v14.4s, v19.4s\n"
        "fmax v15.4s, v15.4s, v19.4s\n"
        "fmax v16.4s, v16.4s, v19.4s\n"
        "fmax v17.4s, v17.4s, v19.4s\n"
        "fmax v18.4s, v18.4s, v19.4s\n"
        "52:"  // Width 5: No activation
        "cmp %x[N], #0x14\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "str q17, [%x[output_ptr], #0x30]\n"
        "add %x[output_ptr], %x[output_ptr], #0x40\n"
        "blt 53f\n"
        "str q18, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 55f\n"
        "53:"  // Width 5: Partial writeback
        "tbz %x[N], #1, 54f\n"
        "str d18, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 55f\n"
        "st1 { v18.s }[2], [%x[output_ptr]]\n"
        "b 55f\n"
        "54:"  // Width 5: Partial direct writeback: partial_1_16
        "tbz %x[N], #0, 55f\n"
        "str s18, [%x[output_ptr], #0x0]\n"
        "55:"  // Width 5: Writeback done
        "b 100f\n"
        "56:"  // Width 6
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x20, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x20, #0x0]\n"
        "movi v26.16b, #0x0\n"
        "mov x23, %x[A_ptr]\n"
        "ldr q18, [x20, #0x10]\n"
        "ldr q19, [x20, #0x20]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "add x20, x20, #0x30\n"
        "cmp x24, #0x4\n"
        "zip2 v21.4s, v17.4s, v26.4s\n"
        "zip1 v20.4s, v17.4s, v26.4s\n"
        "zip2 v17.4s, v15.4s, v26.4s\n"
        "zip2 v25.4s, v19.4s, v26.4s\n"
        "zip1 v24.4s, v19.4s, v26.4s\n"
        "zip2 v23.4s, v18.4s, v26.4s\n"
        "zip1 v22.4s, v18.4s, v26.4s\n"
        "zip2 v19.4s, v16.4s, v26.4s\n"
        "zip1 v18.4s, v16.4s, v26.4s\n"
        "zip1 v16.4s, v15.4s, v26.4s\n"
        "zip2 v15.4s, v14.4s, v26.4s\n"
        "zip1 v14.4s, v14.4s, v26.4s\n"
        "blt 59f\n"
        "cmp x24, #0x8\n"
        "blt 58f\n"
        "57:"  // Width 6: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "ldr q8, [x20, #0x10]\n"
        "ldr q9, [x20, #0x20]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "ldr q10, [x20, #0x30]\n"
        "ldr q11, [x20, #0x40]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "ldr q12, [x20, #0x50]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd36  // bfdot v22.4s, v9.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fd57  // bfdot v23.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd78  // bfdot v24.4s, v11.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fd99  // bfdot v25.4s, v12.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 57b\n"
        "58:"  // Width 6: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q13, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q1, [%x[B_ptr], #0x10]\n"
        "ldr q2, [%x[B_ptr], #0x20]\n"
        "ldr q3, [%x[B_ptr], #0x30]\n"
        "ldr q4, [%x[B_ptr], #0x40]\n"
        "ldr q5, [%x[B_ptr], #0x50]\n"
        "ldr q6, [x20, #0x0]\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        ".inst 0x6e40fdae  // bfdot v14.4s, v13.8h, v0.8h\n"
        "ldr q7, [x20, #0x10]\n"
        "ldr q8, [x20, #0x20]\n"
        ".inst 0x6e40fc2f  // bfdot v15.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc50  // bfdot v16.4s, v2.8h, v0.8h\n"
        "ldr q9, [x20, #0x30]\n"
        "ldr q10, [x20, #0x40]\n"
        ".inst 0x6e40fc71  // bfdot v17.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc92  // bfdot v18.4s, v4.8h, v0.8h\n"
        "ldr q11, [x20, #0x50]\n"
        ".inst 0x6e40fcb3  // bfdot v19.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd4  // bfdot v20.4s, v6.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        ".inst 0x6e40fcf5  // bfdot v21.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd16  // bfdot v22.4s, v8.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        ".inst 0x6e40fd37  // bfdot v23.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd58  // bfdot v24.4s, v10.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        ".inst 0x6e40fd79  // bfdot v25.4s, v11.8h, v0.8h\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "59:"  // Width 6: Multiply loop: Main loop skip
        "cbz x24, 62f\n"
        "tbz x24, #1, 60f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 61f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 61f\n"
        "60:"  // Width 6: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "61:"  // Width 6: Multiply loop: Ragged operand read: Done
        "ldr q12, [%x[B_ptr], #0x0]\n"
        "ldr q13, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q1, [%x[B_ptr], #0x20]\n"
        "ldr q2, [%x[B_ptr], #0x30]\n"
        "ldr q3, [%x[B_ptr], #0x40]\n"
        "ldr q4, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q5, [x20, #0x0]\n"
        "ldr q6, [x20, #0x10]\n"
        ".inst 0x6e40fd8e  // bfdot v14.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdaf  // bfdot v15.4s, v13.8h, v0.8h\n"
        "ldr q7, [x20, #0x20]\n"
        "ldr q8, [x20, #0x30]\n"
        ".inst 0x6e40fc30  // bfdot v16.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc51  // bfdot v17.4s, v2.8h, v0.8h\n"
        "ldr q9, [x20, #0x40]\n"
        "ldr q10, [x20, #0x50]\n"
        ".inst 0x6e40fc72  // bfdot v18.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc93  // bfdot v19.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb4  // bfdot v20.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd5  // bfdot v21.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf6  // bfdot v22.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd17  // bfdot v23.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd38  // bfdot v24.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd59  // bfdot v25.4s, v10.8h, v0.8h\n"
        "62:"  // Width 6: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "faddp v18.4s, v22.4s, v23.4s\n"
        "faddp v19.4s, v24.4s, v25.4s\n"
        "tbz %x[flags], #1, 63f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v21.4s }, [x21]\n"
        "ld1r { v20.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v21.4s\n"
        "fmin v15.4s, v15.4s, v21.4s\n"
        "fmin v16.4s, v16.4s, v21.4s\n"
        "fmin v17.4s, v17.4s, v21.4s\n"
        "fmin v18.4s, v18.4s, v21.4s\n"
        "fmin v19.4s, v19.4s, v21.4s\n"
        "fmax v14.4s, v14.4s, v20.4s\n"
        "fmax v15.4s, v15.4s, v20.4s\n"
        "fmax v16.4s, v16.4s, v20.4s\n"
        "fmax v17.4s, v17.4s, v20.4s\n"
        "fmax v18.4s, v18.4s, v20.4s\n"
        "fmax v19.4s, v19.4s, v20.4s\n"
        "63:"  // Width 6: No activation
        "cmp %x[N], #0x18\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "str q17, [%x[output_ptr], #0x30]\n"
        "str q18, [%x[output_ptr], #0x40]\n"
        "add %x[output_ptr], %x[output_ptr], #0x50\n"
        "blt 64f\n"
        "str q19, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 66f\n"
        "64:"  // Width 6: Partial writeback
        "tbz %x[N], #1, 65f\n"
        "str d19, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 66f\n"
        "st1 { v19.s }[2], [%x[output_ptr]]\n"
        "b 66f\n"
        "65:"  // Width 6: Partial direct writeback: partial_1_20
        "tbz %x[N], #0, 66f\n"
        "str s19, [%x[output_ptr], #0x0]\n"
        "66:"  // Width 6: Writeback done
        "b 100f\n"
        "67:"  // Width 7
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x21, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x21, #0x0]\n"
        "add x20, %x[B_ptr], x26, LSL #1\n"
        "movi v28.16b, #0x0\n"
        "ldr q18, [x21, #0x10]\n"
        "ldr q19, [x21, #0x20]\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "ldr q20, [x20, #0x0]\n"
        "cmp x24, #0x4\n"
        "add x21, x21, #0x30\n"
        "add x20, x20, #0x30\n"
        "zip2 v21.4s, v17.4s, v28.4s\n"
        "zip2 v25.4s, v19.4s, v28.4s\n"
        "zip1 v24.4s, v19.4s, v28.4s\n"
        "zip2 v23.4s, v18.4s, v28.4s\n"
        "zip2 v27.4s, v20.4s, v28.4s\n"
        "zip1 v26.4s, v20.4s, v28.4s\n"
        "zip1 v22.4s, v18.4s, v28.4s\n"
        "zip1 v20.4s, v17.4s, v28.4s\n"
        "zip2 v19.4s, v16.4s, v28.4s\n"
        "zip1 v18.4s, v16.4s, v28.4s\n"
        "zip2 v17.4s, v15.4s, v28.4s\n"
        "zip1 v16.4s, v15.4s, v28.4s\n"
        "zip2 v15.4s, v14.4s, v28.4s\n"
        "zip1 v14.4s, v14.4s, v28.4s\n"
        "blt 70f\n"
        "cmp x24, #0x8\n"
        "blt 69f\n"
        "68:"  // Width 7: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x21, #0x0]\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q8, [x21, #0x10]\n"
        "ldr q9, [x21, #0x20]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "ldr q10, [x21, #0x30]\n"
        "ldr q11, [x21, #0x40]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "ldr q12, [x21, #0x50]\n"
        "ldr q13, [x20, #0x0]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "ldr q1, [x20, #0x10]\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd36  // bfdot v22.4s, v9.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fd57  // bfdot v23.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd78  // bfdot v24.4s, v11.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fd99  // bfdot v25.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdba  // bfdot v26.4s, v13.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc3b  // bfdot v27.4s, v1.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 68b\n"
        "69:"  // Width 7: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q2, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q3, [%x[B_ptr], #0x10]\n"
        "ldr q4, [%x[B_ptr], #0x20]\n"
        "ldr q5, [%x[B_ptr], #0x30]\n"
        "ldr q6, [%x[B_ptr], #0x40]\n"
        "ldr q7, [%x[B_ptr], #0x50]\n"
        "ldr q8, [x21, #0x0]\n"
        ".inst 0x6e40fc4e  // bfdot v14.4s, v2.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q9, [x21, #0x10]\n"
        "ldr q10, [x21, #0x20]\n"
        ".inst 0x6e40fc6f  // bfdot v15.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc90  // bfdot v16.4s, v4.8h, v0.8h\n"
        "ldr q11, [x21, #0x30]\n"
        "ldr q12, [x21, #0x40]\n"
        ".inst 0x6e40fcb1  // bfdot v17.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd2  // bfdot v18.4s, v6.8h, v0.8h\n"
        "ldr q13, [x21, #0x50]\n"
        "ldr q1, [x20, #0x0]\n"
        ".inst 0x6e40fcf3  // bfdot v19.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd14  // bfdot v20.4s, v8.8h, v0.8h\n"
        "ldr q2, [x20, #0x10]\n"
        ".inst 0x6e40fd35  // bfdot v21.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd56  // bfdot v22.4s, v10.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fd77  // bfdot v23.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd98  // bfdot v24.4s, v12.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fdb9  // bfdot v25.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc3a  // bfdot v26.4s, v1.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc5b  // bfdot v27.4s, v2.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "70:"  // Width 7: Multiply loop: Main loop skip
        "cbz x24, 73f\n"
        "tbz x24, #1, 71f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 72f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 72f\n"
        "71:"  // Width 7: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "72:"  // Width 7: Multiply loop: Ragged operand read: Done
        "ldr q3, [%x[B_ptr], #0x0]\n"
        "ldr q4, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q5, [%x[B_ptr], #0x20]\n"
        "ldr q6, [%x[B_ptr], #0x30]\n"
        "ldr q7, [%x[B_ptr], #0x40]\n"
        "ldr q8, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q9, [x21, #0x0]\n"
        "ldr q10, [x21, #0x10]\n"
        ".inst 0x6e40fc6e  // bfdot v14.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc8f  // bfdot v15.4s, v4.8h, v0.8h\n"
        "ldr q11, [x21, #0x20]\n"
        "ldr q12, [x21, #0x30]\n"
        ".inst 0x6e40fcb0  // bfdot v16.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd1  // bfdot v17.4s, v6.8h, v0.8h\n"
        "ldr q13, [x21, #0x40]\n"
        "ldr q1, [x21, #0x50]\n"
        ".inst 0x6e40fcf2  // bfdot v18.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd13  // bfdot v19.4s, v8.8h, v0.8h\n"
        "ldr q2, [x20, #0x0]\n"
        "ldr q3, [x20, #0x10]\n"
        ".inst 0x6e40fd34  // bfdot v20.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd55  // bfdot v21.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd76  // bfdot v22.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd97  // bfdot v23.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdb8  // bfdot v24.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc39  // bfdot v25.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc5a  // bfdot v26.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc7b  // bfdot v27.4s, v3.8h, v0.8h\n"
        "73:"  // Width 7: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "faddp v18.4s, v22.4s, v23.4s\n"
        "faddp v19.4s, v24.4s, v25.4s\n"
        "faddp v20.4s, v26.4s, v27.4s\n"
        "tbz %x[flags], #1, 74f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v22.4s }, [x21]\n"
        "ld1r { v21.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v22.4s\n"
        "fmin v15.4s, v15.4s, v22.4s\n"
        "fmin v16.4s, v16.4s, v22.4s\n"
        "fmin v17.4s, v17.4s, v22.4s\n"
        "fmin v18.4s, v18.4s, v22.4s\n"
        "fmin v19.4s, v19.4s, v22.4s\n"
        "fmin v20.4s, v20.4s, v22.4s\n"
        "fmax v14.4s, v14.4s, v21.4s\n"
        "fmax v15.4s, v15.4s, v21.4s\n"
        "fmax v16.4s, v16.4s, v21.4s\n"
        "fmax v17.4s, v17.4s, v21.4s\n"
        "fmax v18.4s, v18.4s, v21.4s\n"
        "fmax v19.4s, v19.4s, v21.4s\n"
        "fmax v20.4s, v20.4s, v21.4s\n"
        "74:"  // Width 7: No activation
        "cmp %x[N], #0x1c\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "str q17, [%x[output_ptr], #0x30]\n"
        "str q18, [%x[output_ptr], #0x40]\n"
        "str q19, [%x[output_ptr], #0x50]\n"
        "add %x[output_ptr], %x[output_ptr], #0x60\n"
        "blt 75f\n"
        "str q20, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 77f\n"
        "75:"  // Width 7: Partial writeback
        "tbz %x[N], #1, 76f\n"
        "str d20, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 77f\n"
        "st1 { v20.s }[2], [%x[output_ptr]]\n"
        "b 77f\n"
        "76:"  // Width 7: Partial direct writeback: partial_1_24
        "tbz %x[N], #0, 77f\n"
        "str s20, [%x[output_ptr], #0x0]\n"
        "77:"  // Width 7: Writeback done
        "b 100f\n"
        "78:"  // Width 8
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x21, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x21, #0x0]\n"
        "add x20, %x[B_ptr], x26, LSL #1\n"
        "movi v30.16b, #0x0\n"
        "ldr q18, [x21, #0x10]\n"
        "ldr q19, [x21, #0x20]\n"
        "mov x23, %x[A_ptr]\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "ldr q20, [x20, #0x0]\n"
        "ldr q21, [x20, #0x10]\n"
        "cmp x24, #0x4\n"
        "add x21, x21, #0x30\n"
        "add x20, x20, #0x30\n"
        "zip2 v25.4s, v19.4s, v30.4s\n"
        "zip1 v24.4s, v19.4s, v30.4s\n"
        "zip2 v23.4s, v18.4s, v30.4s\n"
        "zip1 v22.4s, v18.4s, v30.4s\n"
        "zip2 v29.4s, v21.4s, v30.4s\n"
        "zip1 v28.4s, v21.4s, v30.4s\n"
        "zip2 v27.4s, v20.4s, v30.4s\n"
        "zip1 v26.4s, v20.4s, v30.4s\n"
        "zip2 v21.4s, v17.4s, v30.4s\n"
        "zip1 v20.4s, v17.4s, v30.4s\n"
        "zip2 v19.4s, v16.4s, v30.4s\n"
        "zip1 v18.4s, v16.4s, v30.4s\n"
        "zip2 v17.4s, v15.4s, v30.4s\n"
        "zip1 v16.4s, v15.4s, v30.4s\n"
        "zip2 v15.4s, v14.4s, v30.4s\n"
        "zip1 v14.4s, v14.4s, v30.4s\n"
        "blt 81f\n"
        "cmp x24, #0x8\n"
        "blt 80f\n"
        "79:"  // Width 8: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x21, #0x0]\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q8, [x21, #0x10]\n"
        "ldr q9, [x21, #0x20]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "ldr q10, [x21, #0x30]\n"
        "ldr q11, [x21, #0x40]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "ldr q12, [x21, #0x50]\n"
        "ldr q13, [x20, #0x0]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "ldr q1, [x20, #0x10]\n"
        "ldr q2, [x20, #0x20]\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd36  // bfdot v22.4s, v9.8h, v0.8h\n"
        "ldr q3, [x20, #0x30]\n"
        ".inst 0x6e40fd57  // bfdot v23.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd78  // bfdot v24.4s, v11.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fd99  // bfdot v25.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdba  // bfdot v26.4s, v13.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fc3b  // bfdot v27.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc5c  // bfdot v28.4s, v2.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fc7d  // bfdot v29.4s, v3.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 79b\n"
        "80:"  // Width 8: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q4, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q5, [%x[B_ptr], #0x10]\n"
        "ldr q6, [%x[B_ptr], #0x20]\n"
        "ldr q7, [%x[B_ptr], #0x30]\n"
        "ldr q8, [%x[B_ptr], #0x40]\n"
        "ldr q9, [%x[B_ptr], #0x50]\n"
        "ldr q10, [x21, #0x0]\n"
        ".inst 0x6e40fc8e  // bfdot v14.4s, v4.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q11, [x21, #0x10]\n"
        "ldr q12, [x21, #0x20]\n"
        ".inst 0x6e40fcaf  // bfdot v15.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcd0  // bfdot v16.4s, v6.8h, v0.8h\n"
        "ldr q13, [x21, #0x30]\n"
        "ldr q1, [x21, #0x40]\n"
        ".inst 0x6e40fcf1  // bfdot v17.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd12  // bfdot v18.4s, v8.8h, v0.8h\n"
        "ldr q2, [x21, #0x50]\n"
        "ldr q3, [x20, #0x0]\n"
        ".inst 0x6e40fd33  // bfdot v19.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd54  // bfdot v20.4s, v10.8h, v0.8h\n"
        "ldr q4, [x20, #0x10]\n"
        "ldr q5, [x20, #0x20]\n"
        ".inst 0x6e40fd75  // bfdot v21.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd96  // bfdot v22.4s, v12.8h, v0.8h\n"
        "ldr q6, [x20, #0x30]\n"
        ".inst 0x6e40fdb7  // bfdot v23.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc38  // bfdot v24.4s, v1.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fc59  // bfdot v25.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc7a  // bfdot v26.4s, v3.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fc9b  // bfdot v27.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcbc  // bfdot v28.4s, v5.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fcdd  // bfdot v29.4s, v6.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "81:"  // Width 8: Multiply loop: Main loop skip
        "cbz x24, 84f\n"
        "tbz x24, #1, 82f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 83f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 83f\n"
        "82:"  // Width 8: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "83:"  // Width 8: Multiply loop: Ragged operand read: Done
        "ldr q7, [%x[B_ptr], #0x0]\n"
        "ldr q8, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q9, [%x[B_ptr], #0x20]\n"
        "ldr q10, [%x[B_ptr], #0x30]\n"
        "ldr q11, [%x[B_ptr], #0x40]\n"
        "ldr q12, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q13, [x21, #0x0]\n"
        "ldr q1, [x21, #0x10]\n"
        ".inst 0x6e40fcee  // bfdot v14.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd0f  // bfdot v15.4s, v8.8h, v0.8h\n"
        "ldr q2, [x21, #0x20]\n"
        "ldr q3, [x21, #0x30]\n"
        ".inst 0x6e40fd30  // bfdot v16.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd51  // bfdot v17.4s, v10.8h, v0.8h\n"
        "ldr q4, [x21, #0x40]\n"
        "ldr q5, [x21, #0x50]\n"
        ".inst 0x6e40fd72  // bfdot v18.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd93  // bfdot v19.4s, v12.8h, v0.8h\n"
        "ldr q6, [x20, #0x0]\n"
        "ldr q7, [x20, #0x10]\n"
        ".inst 0x6e40fdb4  // bfdot v20.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc35  // bfdot v21.4s, v1.8h, v0.8h\n"
        "ldr q8, [x20, #0x20]\n"
        "ldr q9, [x20, #0x30]\n"
        ".inst 0x6e40fc56  // bfdot v22.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc77  // bfdot v23.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc98  // bfdot v24.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb9  // bfdot v25.4s, v5.8h, v0.8h\n"
        ".inst 0x6e40fcda  // bfdot v26.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcfb  // bfdot v27.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd1c  // bfdot v28.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd3d  // bfdot v29.4s, v9.8h, v0.8h\n"
        "84:"  // Width 8: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "faddp v18.4s, v22.4s, v23.4s\n"
        "faddp v19.4s, v24.4s, v25.4s\n"
        "faddp v20.4s, v26.4s, v27.4s\n"
        "faddp v21.4s, v28.4s, v29.4s\n"
        "tbz %x[flags], #1, 85f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v23.4s }, [x21]\n"
        "ld1r { v22.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v23.4s\n"
        "fmin v15.4s, v15.4s, v23.4s\n"
        "fmin v16.4s, v16.4s, v23.4s\n"
        "fmin v17.4s, v17.4s, v23.4s\n"
        "fmin v18.4s, v18.4s, v23.4s\n"
        "fmin v19.4s, v19.4s, v23.4s\n"
        "fmin v20.4s, v20.4s, v23.4s\n"
        "fmin v21.4s, v21.4s, v23.4s\n"
        "fmax v14.4s, v14.4s, v22.4s\n"
        "fmax v15.4s, v15.4s, v22.4s\n"
        "fmax v16.4s, v16.4s, v22.4s\n"
        "fmax v17.4s, v17.4s, v22.4s\n"
        "fmax v18.4s, v18.4s, v22.4s\n"
        "fmax v19.4s, v19.4s, v22.4s\n"
        "fmax v20.4s, v20.4s, v22.4s\n"
        "fmax v21.4s, v21.4s, v22.4s\n"
        "85:"  // Width 8: No activation
        "cmp %x[N], #0x20\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "str q17, [%x[output_ptr], #0x30]\n"
        "str q18, [%x[output_ptr], #0x40]\n"
        "str q19, [%x[output_ptr], #0x50]\n"
        "str q20, [%x[output_ptr], #0x60]\n"
        "add %x[output_ptr], %x[output_ptr], #0x70\n"
        "blt 86f\n"
        "str q21, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 88f\n"
        "86:"  // Width 8: Partial writeback
        "tbz %x[N], #1, 87f\n"
        "str d21, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 88f\n"
        "st1 { v21.s }[2], [%x[output_ptr]]\n"
        "b 88f\n"
        "87:"  // Width 8: Partial direct writeback: partial_1_28
        "tbz %x[N], #0, 88f\n"
        "str s21, [%x[output_ptr], #0x0]\n"
        "88:"  // Width 8: Writeback done
        "b 100f\n"
        "89:"  // Width 9
        "ldr q14, [%x[B_ptr], #0x0]\n"
        "ldr q15, [%x[B_ptr], #0x10]\n"
        "mov x24, %x[K]\n"
        "add x21, %x[B_ptr], x26\n"
        "ldr q16, [%x[B_ptr], #0x20]\n"
        "ldr q17, [x21, #0x0]\n"
        "add x20, %x[B_ptr], x26, LSL #1\n"
        "movi v0.16b, #0x0\n"
        "ldr q18, [x21, #0x10]\n"
        "ldr q19, [x21, #0x20]\n"
        "mov x23, %x[A_ptr]\n"
        "add x22, x20, x26\n"
        "ldr q20, [x20, #0x0]\n"
        "ldr q21, [x20, #0x10]\n"
        "cmp x24, #0x4\n"
        "add %x[B_ptr], %x[B_ptr], #0x30\n"
        "ldr q22, [x20, #0x20]\n"
        "add x21, x21, #0x30\n"
        "add x20, x20, #0x30\n"
        "zip2 v25.4s, v19.4s, v0.4s\n"
        "zip1 v24.4s, v19.4s, v0.4s\n"
        "zip2 v23.4s, v18.4s, v0.4s\n"
        "zip2 v29.4s, v21.4s, v0.4s\n"
        "zip1 v28.4s, v21.4s, v0.4s\n"
        "zip2 v31.4s, v22.4s, v0.4s\n"
        "zip1 v30.4s, v22.4s, v0.4s\n"
        "zip2 v27.4s, v20.4s, v0.4s\n"
        "zip1 v26.4s, v20.4s, v0.4s\n"
        "zip1 v22.4s, v18.4s, v0.4s\n"
        "zip2 v21.4s, v17.4s, v0.4s\n"
        "zip1 v20.4s, v17.4s, v0.4s\n"
        "zip2 v19.4s, v16.4s, v0.4s\n"
        "zip1 v18.4s, v16.4s, v0.4s\n"
        "zip2 v17.4s, v15.4s, v0.4s\n"
        "zip1 v16.4s, v15.4s, v0.4s\n"
        "zip2 v15.4s, v14.4s, v0.4s\n"
        "zip1 v14.4s, v14.4s, v0.4s\n"
        "blt 92f\n"
        "cmp x24, #0x8\n"
        "blt 91f\n"
        "90:"  // Width 9: Multiply loop: Main loop head
        "ld1r { v0.2d }, [x23]\n"
        "ldr q1, [%x[B_ptr], #0x0]\n"
        "sub x24, x24, #0x4\n"
        "add x23, x23, #0x8\n"
        "ldr q2, [%x[B_ptr], #0x10]\n"
        "ldr q3, [%x[B_ptr], #0x20]\n"
        "cmp x24, #0x8\n"
        "ldr q4, [%x[B_ptr], #0x30]\n"
        "ldr q5, [%x[B_ptr], #0x40]\n"
        "ldr q6, [%x[B_ptr], #0x50]\n"
        "ldr q7, [x21, #0x0]\n"
        ".inst 0x6e40fc2e  // bfdot v14.4s, v1.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q8, [x21, #0x10]\n"
        "ldr q9, [x21, #0x20]\n"
        ".inst 0x6e40fc4f  // bfdot v15.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc70  // bfdot v16.4s, v3.8h, v0.8h\n"
        "ldr q10, [x21, #0x30]\n"
        "ldr q11, [x21, #0x40]\n"
        ".inst 0x6e40fc91  // bfdot v17.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb2  // bfdot v18.4s, v5.8h, v0.8h\n"
        "ldr q12, [x21, #0x50]\n"
        "ldr q13, [x20, #0x0]\n"
        ".inst 0x6e40fcd3  // bfdot v19.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf4  // bfdot v20.4s, v7.8h, v0.8h\n"
        "ldr q1, [x20, #0x10]\n"
        "ldr q2, [x20, #0x20]\n"
        ".inst 0x6e40fd15  // bfdot v21.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd36  // bfdot v22.4s, v9.8h, v0.8h\n"
        "ldr q3, [x20, #0x30]\n"
        "ldr q4, [x20, #0x40]\n"
        ".inst 0x6e40fd57  // bfdot v23.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd78  // bfdot v24.4s, v11.8h, v0.8h\n"
        "ldr q5, [x20, #0x50]\n"
        ".inst 0x6e40fd99  // bfdot v25.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdba  // bfdot v26.4s, v13.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fc3b  // bfdot v27.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc5c  // bfdot v28.4s, v2.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fc7d  // bfdot v29.4s, v3.8h, v0.8h\n"
        ".inst 0x6e40fc9e  // bfdot v30.4s, v4.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fcbf  // bfdot v31.4s, v5.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "bge 90b\n"
        "91:"  // Width 9: Multiply loop: Single iteration only
        "ld1r { v0.2d }, [x23]\n"
        "ldr q6, [%x[B_ptr], #0x0]\n"
        "add x23, x23, #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr q7, [%x[B_ptr], #0x10]\n"
        "ldr q8, [%x[B_ptr], #0x20]\n"
        "ldr q9, [%x[B_ptr], #0x30]\n"
        "ldr q10, [%x[B_ptr], #0x40]\n"
        "ldr q11, [%x[B_ptr], #0x50]\n"
        "ldr q12, [x21, #0x0]\n"
        ".inst 0x6e40fcce  // bfdot v14.4s, v6.8h, v0.8h\n"
        "add %x[B_ptr], %x[B_ptr], #0x60\n"
        "ldr q13, [x21, #0x10]\n"
        "ldr q1, [x21, #0x20]\n"
        ".inst 0x6e40fcef  // bfdot v15.4s, v7.8h, v0.8h\n"
        ".inst 0x6e40fd10  // bfdot v16.4s, v8.8h, v0.8h\n"
        "ldr q2, [x21, #0x30]\n"
        "ldr q3, [x21, #0x40]\n"
        ".inst 0x6e40fd31  // bfdot v17.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd52  // bfdot v18.4s, v10.8h, v0.8h\n"
        "ldr q4, [x21, #0x50]\n"
        "ldr q5, [x20, #0x0]\n"
        ".inst 0x6e40fd73  // bfdot v19.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd94  // bfdot v20.4s, v12.8h, v0.8h\n"
        "ldr q6, [x20, #0x10]\n"
        "ldr q7, [x20, #0x20]\n"
        ".inst 0x6e40fdb5  // bfdot v21.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc36  // bfdot v22.4s, v1.8h, v0.8h\n"
        "ldr q8, [x20, #0x30]\n"
        "ldr q9, [x20, #0x40]\n"
        ".inst 0x6e40fc57  // bfdot v23.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc78  // bfdot v24.4s, v3.8h, v0.8h\n"
        "ldr q10, [x20, #0x50]\n"
        ".inst 0x6e40fc99  // bfdot v25.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcba  // bfdot v26.4s, v5.8h, v0.8h\n"
        "add x21, x21, #0x60\n"
        ".inst 0x6e40fcdb  // bfdot v27.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcfc  // bfdot v28.4s, v7.8h, v0.8h\n"
        "add x20, x20, #0x60\n"
        "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
        ".inst 0x6e40fd1d  // bfdot v29.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd3e  // bfdot v30.4s, v9.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
        "prfm pldl1keep, [%x[B_ptr], #0x480]\n"
        ".inst 0x6e40fd5f  // bfdot v31.4s, v10.8h, v0.8h\n"
        "prfm pldl1keep, [%x[B_ptr], #0x4c0]\n"
        "prfm pldl1keep, [x23, #0x80]\n"
        "92:"  // Width 9: Multiply loop: Main loop skip
        "cbz x24, 95f\n"
        "tbz x24, #1, 93f\n"
        "ldr s0, [x23], #0x4\n"
        "tbz x24, #0, 94f\n"
        "ld1 { v0.h }[2], [x23]\n"
        "b 94f\n"
        "93:"  // Width 9: Multiply loop: Ragged operand read: partial_1_0
        "ldr h0, [x23, #0x0]\n"
        "94:"  // Width 9: Multiply loop: Ragged operand read: Done
        "ldr q11, [%x[B_ptr], #0x0]\n"
        "ldr q12, [%x[B_ptr], #0x10]\n"
        "dup v0.2d, v0.d[0]\n"
        "ldr q13, [%x[B_ptr], #0x20]\n"
        "ldr q1, [%x[B_ptr], #0x30]\n"
        "ldr q2, [%x[B_ptr], #0x40]\n"
        "ldr q3, [%x[B_ptr], #0x50]\n"
        "add %x[B_ptr], %x[B_ptr], #0x90\n"
        "ldr q4, [x21, #0x0]\n"
        "ldr q5, [x21, #0x10]\n"
        ".inst 0x6e40fd6e  // bfdot v14.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd8f  // bfdot v15.4s, v12.8h, v0.8h\n"
        "ldr q6, [x21, #0x20]\n"
        "ldr q7, [x21, #0x30]\n"
        ".inst 0x6e40fdb0  // bfdot v16.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc31  // bfdot v17.4s, v1.8h, v0.8h\n"
        "ldr q8, [x21, #0x40]\n"
        "ldr q9, [x21, #0x50]\n"
        ".inst 0x6e40fc52  // bfdot v18.4s, v2.8h, v0.8h\n"
        ".inst 0x6e40fc73  // bfdot v19.4s, v3.8h, v0.8h\n"
        "ldr q10, [x20, #0x0]\n"
        "ldr q11, [x20, #0x10]\n"
        ".inst 0x6e40fc94  // bfdot v20.4s, v4.8h, v0.8h\n"
        ".inst 0x6e40fcb5  // bfdot v21.4s, v5.8h, v0.8h\n"
        "ldr q12, [x20, #0x20]\n"
        "ldr q13, [x20, #0x30]\n"
        ".inst 0x6e40fcd6  // bfdot v22.4s, v6.8h, v0.8h\n"
        ".inst 0x6e40fcf7  // bfdot v23.4s, v7.8h, v0.8h\n"
        "ldr q1, [x20, #0x40]\n"
        "ldr q2, [x20, #0x50]\n"
        ".inst 0x6e40fd18  // bfdot v24.4s, v8.8h, v0.8h\n"
        ".inst 0x6e40fd39  // bfdot v25.4s, v9.8h, v0.8h\n"
        ".inst 0x6e40fd5a  // bfdot v26.4s, v10.8h, v0.8h\n"
        ".inst 0x6e40fd7b  // bfdot v27.4s, v11.8h, v0.8h\n"
        ".inst 0x6e40fd9c  // bfdot v28.4s, v12.8h, v0.8h\n"
        ".inst 0x6e40fdbd  // bfdot v29.4s, v13.8h, v0.8h\n"
        ".inst 0x6e40fc3e  // bfdot v30.4s, v1.8h, v0.8h\n"
        ".inst 0x6e40fc5f  // bfdot v31.4s, v2.8h, v0.8h\n"
        "95:"  // Width 9: Multiply loop: No odd multiplies
        "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
        "faddp v14.4s, v14.4s, v15.4s\n"
        "faddp v15.4s, v16.4s, v17.4s\n"
        "faddp v16.4s, v18.4s, v19.4s\n"
        "faddp v17.4s, v20.4s, v21.4s\n"
        "faddp v18.4s, v22.4s, v23.4s\n"
        "faddp v19.4s, v24.4s, v25.4s\n"
        "faddp v20.4s, v26.4s, v27.4s\n"
        "faddp v21.4s, v28.4s, v29.4s\n"
        "faddp v22.4s, v30.4s, v31.4s\n"
        "tbz %x[flags], #1, 96f\n"
        "add x21, %x[args_ptr], %[offset_max]\n"
        "add x20, %x[args_ptr], %[offset_min]\n"
        "ld1r { v24.4s }, [x21]\n"
        "ld1r { v23.4s }, [x20]\n"
        "fmin v14.4s, v14.4s, v24.4s\n"
        "fmin v15.4s, v15.4s, v24.4s\n"
        "fmin v16.4s, v16.4s, v24.4s\n"
        "fmin v17.4s, v17.4s, v24.4s\n"
        "fmin v18.4s, v18.4s, v24.4s\n"
        "fmin v19.4s, v19.4s, v24.4s\n"
        "fmin v20.4s, v20.4s, v24.4s\n"
        "fmin v21.4s, v21.4s, v24.4s\n"
        "fmin v22.4s, v22.4s, v24.4s\n"
        "fmax v14.4s, v14.4s, v23.4s\n"
        "fmax v15.4s, v15.4s, v23.4s\n"
        "fmax v16.4s, v16.4s, v23.4s\n"
        "fmax v17.4s, v17.4s, v23.4s\n"
        "fmax v18.4s, v18.4s, v23.4s\n"
        "fmax v19.4s, v19.4s, v23.4s\n"
        "fmax v20.4s, v20.4s, v23.4s\n"
        "fmax v21.4s, v21.4s, v23.4s\n"
        "fmax v22.4s, v22.4s, v23.4s\n"
        "96:"  // Width 9: No activation
        "cmp %x[N], #0x24\n"
        "str q14, [%x[output_ptr], #0x0]\n"
        "str q15, [%x[output_ptr], #0x10]\n"
        "str q16, [%x[output_ptr], #0x20]\n"
        "str q17, [%x[output_ptr], #0x30]\n"
        "str q18, [%x[output_ptr], #0x40]\n"
        "str q19, [%x[output_ptr], #0x50]\n"
        "str q20, [%x[output_ptr], #0x60]\n"
        "str q21, [%x[output_ptr], #0x70]\n"
        "add %x[output_ptr], %x[output_ptr], #0x80\n"
        "blt 97f\n"
        "str q22, [%x[output_ptr], #0x0]\n"
        "add %x[output_ptr], %x[output_ptr], #0x10\n"
        "b 99f\n"
        "97:"  // Width 9: Partial writeback
        "tbz %x[N], #1, 98f\n"
        "str d22, [%x[output_ptr]], #0x8\n"
        "tbz %x[N], #0, 99f\n"
        "st1 { v22.s }[2], [%x[output_ptr]]\n"
        "b 99f\n"
        "98:"  // Width 9: Partial direct writeback: partial_1_32
        "tbz %x[N], #0, 99f\n"
        "str s22, [%x[output_ptr], #0x0]\n"
        "99:"  // Width 9: Writeback done
        "subs x25, x25, #0x9\n"
        "mov %x[B_ptr], x22\n"
        "sub %x[N], %x[N], #0x24\n"
        "bgt 1b\n"
        "100:"  // Exit
        : [B_ptr] "+&r"(B_ptr), [N] "+&r"(N), [output_ptr] "+&r"(output_ptr)
        : [A_ptr] "r"(A_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags),
          [offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval))
        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26");
}

#endif  // Architectural features check.
